The prices of the stocks of companies listed under a global exchange are influenced by a variety of factors, with the company's financial performance, innovations and collaborations, and market sentiment being factors that play a significant role. News and media reports can rapidly affect investor perceptions and, consequently, stock prices in the highly competitive financial industry. With the sheer volume of news and opinions from a wide variety of sources, investors and financial analysts often struggle to stay updated and accurately interpret its impact on the market. As a result, investment firms need sophisticated tools to analyze market sentiment and integrate this information into their investment strategies.
With an ever-rising number of news articles and opinions, an investment startup aims to leverage artificial intelligence to address the challenge of interpreting stock-related news and its impact on stock prices. They have collected historical daily news for a specific company listed under NASDAQ, along with data on its daily stock price and trade volumes.
As a member of the Data Science and AI team in the startup, you have been tasked with analyzing the data, developing an AI-driven sentiment analysis system that will automatically process and analyze news articles to gauge market sentiment, and summarizing the news at a weekly level to enhance the accuracy of their stock price predictions and optimize investment strategies. This will empower their financial analysts with actionable insights, leading to more informed investment decisions and improved client outcomes.
Data Dictionary:
- Date: The date the news was released
- News: The content of news articles that could potentially affect the company's stock price
- Open: The stock price (in \$) at the beginning of the day
- High: The highest stock price (in \$) reached during the day
- Low: The lowest stock price (in \$) reached during the day
- Close: The adjusted stock price (in \$) at the end of the day
- Volume: The number of shares traded during the day
- Label: The sentiment polarity of the news content
# installing the sentence-transformers for word embeddings
!pip install -U sentence-transformers transformers gensim tqdm -q
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
# for manipulating string data
import string
# library for regular expression
import re
# natural language toolkit and library for stemming
import nltk
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
# library for word cloud and stopwords
from wordcloud import WordCloud, STOPWORDS
# Word2Vec
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
# Sentence Transformer
from sentence_transformers import SentenceTransformer
# pytorch
import torch
# library for data split and model evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
# preprocessing
from sklearn.preprocessing import StandardScaler
# class weight and stratification
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
# lib for classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
stock_news = pd.read_csv('/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/stock_news.csv')
data = stock_news.copy()
# display max width of the column
pd.set_option('display.max_colwidth', None)
# display first 5 rows
data.head(5)
# checking the shape of the dataframe
data.shape
# structural information about the dataframe
data.info()
# checking null values
data.isnull().sum()
# checking for duplicate values
data.duplicated().sum()
# Statistical summary of data
data.describe(include='all').T
# target variable distribution
data['Label'].value_counts()
# target variable distribution percentage
data['Label'].value_counts(normalize=True)
Observations: The News and Date columns are of type object, and the rest of the columns are numeric. The Label distribution indicates that our dataset is imbalanced, with 48% neutral sentiment, 28% negative sentiment and only 22% positive sentiment.
# numeric columns
num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
# We will drop rows with duplicate date
# We will use the table for numerical column analysis
DateDeDupedTable = data.drop_duplicates(subset=['Date'], keep='first')
DateDeDupedTable.shape
DateDeDupedTable.describe().T
# news length - total number of words
data['news_length'] = data['News'].str.split().str.len()
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, top_n=None, figsize = (10, 5), hide_label = False):
    """
    Barplot with the count or percentage of each category printed above its bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of counts (default is False);
          note perc=True takes precedence over hide_label=True
    top_n: displays the top n category levels (default is None, i.e., display all levels)
    figsize: size of the figure (default is (10, 5)); ignored when top_n is given
    hide_label: whether to hide the per-bar annotations (default is False)
    """
    total = len(data[feature])  # number of rows; percentage denominator
    count = data[feature].nunique()  # number of distinct levels (currently unused)
    if top_n is None:
        plt.figure(figsize=figsize)
    else:
        # scale the figure width with the number of categories shown
        plt.figure(figsize=(top_n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep only the top_n most frequent levels, displayed in sorted order
        order=data[feature].value_counts().index[:top_n].sort_values(),
    )
    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            ) # percentage of each class of the category
        elif hide_label == True:
            label = ""  # draw the bar with no annotation text
        else:
            label = p.get_height() # count of each level of the category
        x = p.get_x() + p.get_width() / 2 # horizontal center of the bar
        y = p.get_height() # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        ) # place the annotation slightly above the bar top
    plt.show() # show the plot
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot stacked above a histogram, sharing one x-axis scale

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for the histogram (default None -> automatic)
    """
    # two stacked subplots: a slim boxplot row on top, the histogram below
    fig, (box_ax, hist_ax) = plt.subplots(
        nrows=2,
        sharex=True,  # both panels use the same x-axis
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # boxplot; the mean is marked with a triangle (showmeans=True)
    sns.boxplot(data=data, x=feature, ax=box_ax, showmeans=True, color="violet")
    # histogram; only pass bins when the caller supplied a value
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=hist_ax, bins=bins, palette="winter")
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=hist_ax)
    # reference lines: dashed green for the mean, solid black for the median
    hist_ax.axvline(data[feature].mean(), color="green", linestyle="--")
    hist_ax.axvline(data[feature].median(), color="black", linestyle="-")
### Function to plot distributions of a predictor across target classes
def distribution_plot_wrt_target(data, predictor, target):
    """Show side-by-side boxplots of *predictor* grouped by *target*.

    The left panel includes outliers; the right panel hides them
    (showfliers=False) so the box bodies are easier to compare.

    data: dataframe
    predictor: numeric column to plot on the y-axis
    target: categorical column to group by on the x-axis
    """
    # NOTE: the original computed data[target].unique() into an unused
    # local (target_uniq); removed as dead code.
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    axs[0].set_title(f"Boxplot - target vs {predictor}")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[0], palette="gist_rainbow")
    axs[1].set_title(f"Boxplot (without outliers) - target vs {predictor}")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1],
        showfliers=False,  # hide outlier points in the right panel
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
### Function to generate a word cloud
def generate_wordcloud(text, title):
    """Render a word cloud for *text*.

    text: either a single string, or an iterable of strings which is
          joined with spaces first.
    title: title for the plot

    BUG FIX: the original joined *text* into ``all_text`` but then called
    ``.generate(text)`` on the raw argument, ignoring the join — and
    ``" ".join`` on a plain string would space-separate every character.
    We now join only non-string inputs and feed the result to generate().
    """
    all_text = text if isinstance(text, str) else " ".join(text)
    wordcloud = WordCloud(
        width=1000,
        height=500,
        background_color="black",
        stopwords=set(STOPWORDS),
    ).generate(all_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()
# distribution of label
labeled_barplot(data, 'Label', perc=True)
# distribution of news
labeled_barplot(data, 'Date', figsize=(30, 8), hide_label=True)
for col in num_cols:
histogram_boxplot(DateDeDupedTable, col)
data['news_length'].describe()
histogram_boxplot(data, 'news_length')
Note: The above points are listed to provide guidance on how to approach bivariate analysis. Analysis has to be done beyond the above listed points to get maximum scores.
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.show()
sns.pairplot(data, hue='Label', palette='bright', corner=True)
plt.show()
We notice that price variables are highly correlated to each other
for col in num_cols:
distribution_plot_wrt_target(data, col, 'Label')
for col in num_cols:
plt.figure(figsize=(30, 10))
sns.lineplot(data=data, x='Date', y=col, hue='Label', style="Label", palette="tab10", errorbar=None, markers=True, dashes=False, linewidth=2.5)
plt.xticks(rotation=90)
plt.title(f'{col} Price vs Date')
plt.tight_layout()
plt.show()
for sent in [-1, 0, 1]:
news_text = data[data['Label'] == sent]['News'].tolist()
combined_text = " ".join(news_text) # Convert list to a single string
generate_wordcloud(
combined_text,
f"Sentiment: {'Positive' if sent == 1 else 'Neutral' if sent == 0 else 'Negative'}")
# Volume has a right-skewed distribution; on average 115.7 million stocks are
# traded per day. Open, High, Low and Close are correlated to each other with a
# correlation score of 1. There is no correlation between price and volume.
def remove_special_chars(text):
    """Replace every run of non-alphanumeric characters with a single space.

    text: input string
    returns: string containing only [A-Za-z0-9] characters and single spaces
    """
    pattern = '[^A-Za-z0-9]+'
    # replace each sequence of special characters that matches the pattern
    # with a single space (the original comment said "black string" — it is
    # a space, which keeps word boundaries intact)
    new_text = re.sub(pattern, ' ', text)
    return new_text
data['clean_text'] = data['News'].apply(remove_special_chars)
data.loc[0:3, ['News', 'clean_text']]
data['clean_text'] = data['clean_text'].str.lower()
data['clean_text'] = data['clean_text'].str.strip()
data.loc[0:3, ['News', 'clean_text']]
def remove_stopwords(text):
    """Drop every whitespace-separated token that appears in the word-cloud
    STOPWORDS set and re-join the survivors with single spaces."""
    kept = (token for token in text.split() if token not in STOPWORDS)
    return ' '.join(kept)
data['clean_text'] = data['clean_text'].apply(remove_stopwords)
data.loc[0:3, ['News','clean_text']]
# shared Porter stemmer instance used by apply_stemming
ps = PorterStemmer()

def apply_stemming(text):
    """Stem each whitespace-separated token with the Porter stemmer and
    re-join the stems with single spaces."""
    return ' '.join(ps.stem(token) for token in text.split())
data['clean_text'] = data['clean_text'].apply(apply_stemming)
data.loc[0:3, ['News','clean_text']]
# get the list of the words
words_list = data['clean_text'].apply(lambda x: x.split()).to_list()
w2v = Word2Vec(words_list, vector_size=300, min_count=1, window=5, workers=6)
print('Word2Vec vocubulary length:', len(list(w2v.wv.key_to_index)))
w2v.wv.vector_size
# function to calculate the average Word2Vec embedding of a text content
def average_word_vectorizer(doc, num_features=300):
    """Average the Word2Vec vectors of all in-vocabulary words in *doc*.

    doc: a string (split on whitespace) or an iterable of tokens.
    num_features: embedding dimensionality (default 300, matching the
        Word2Vec model trained above).
    returns: 1-D float64 numpy array of length num_features; all zeros if
        no word of *doc* is in the vocabulary.

    BUG FIX: callers pass the raw ``clean_text`` string, and the original
    ``for word in doc`` iterated it character-by-character, so it averaged
    single-character lookups instead of word vectors. Strings are now
    split into words first; token lists are used as-is.
    """
    words = doc.split() if isinstance(doc, str) else doc
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0
    for word in words:
        if word in w2v.wv:  # skip out-of-vocabulary tokens
            n_words += 1
            feature_vector = np.add(feature_vector, w2v.wv[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector
df_w2v = pd.DataFrame(data['clean_text'].apply(average_word_vectorizer).to_list(), columns=[f'feature_{i}' for i in range(300)])
df_w2v.head()
df_w2v.shape
glove_file_name = '/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(glove_file_name, binary=False)
glove_model.vector_size
print('GloVe model vocabulary length:', len(list(glove_model.key_to_index)))
# function to calculate the average GloVe embedding of a text content
def average_word_vectorizer_glove(doc, num_features=100):
    """Average the GloVe vectors of all in-vocabulary words in *doc*.

    doc: a string (split on whitespace) or an iterable of tokens.
    num_features: embedding dimensionality (default 100, matching the
        glove.6B.100d model loaded above).
    returns: 1-D float64 numpy array of length num_features; all zeros if
        no word of *doc* is in the vocabulary.

    BUG FIX: as with average_word_vectorizer, iterating a raw string walked
    it character-by-character; strings are now split into words first.
    """
    words = doc.split() if isinstance(doc, str) else doc
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0
    for word in words:
        if word in glove_model:  # skip out-of-vocabulary tokens
            n_words += 1
            feature_vector = np.add(feature_vector, glove_model[word])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector
df_glove = pd.DataFrame(data['clean_text'].apply(average_word_vectorizer_glove).to_list(), columns=[f'feature_{i}' for i in range(100)])
df_glove.head()
df_glove.shape
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# embedding using sentence transformer:
# to save GPU/CPU load we will save the embeddings after encoding
# we will load saved encoding
file_path = '/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/sentence_embeddings.npy'
if not os.path.exists(file_path):
embeddings = model.encode(data['News'], device=device, show_progress_bar=True)
np.save('/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/sentence_embeddings.npy', embeddings)
loaded_embeddings = np.load('/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/sentence_embeddings.npy')
loaded_embeddings.shape
# creating a function to plot the confusion matrix
def plot_confusion_matrix(actual, predicted, labels=['Neutral', 'Positive', 'Negative'], title='Confusion Matrix'):
    """Plot a labeled confusion-matrix heatmap for the given predictions.

    actual: true class labels
    predicted: predicted class labels
    labels: axis tick labels; the default order assumes the remapping used
        in this notebook (0=Neutral, 1=Positive, 2=Negative) —
        NOTE(review): confirm it matches the encoding of the inputs.
    title: plot title
    """
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize = (5, 4))
    label_list = labels
    # annotate each cell with the raw count (fmt='.0f')
    sns.heatmap(cm, annot = True, fmt = '.0f', xticklabels = label_list, yticklabels = label_list)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(title)
    plt.show()
# independent and dependent variable initialization
X = df_w2v
y = data['Label']
We set oob_score to True. The oob_score (Out-of-Bag score) is a validation-accuracy estimate in a Random Forest, calculated on the samples that were not included in the bootstrap sample used to train each tree.
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape
# target variable remapping
y_train = y_train.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
y_train.value_counts()
y_test.value_counts()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
rf_w2v = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
rf_w2v.fit(X_train_scaled, y_train)
y_train_pred = rf_w2v.predict(X_train_scaled)
y_test_pred = rf_w2v.predict(X_test_scaled)
df_models = pd.DataFrame({'train':[accuracy_score(y_train, y_train_pred)], 'validation': [rf_w2v.oob_score_], 'test': [accuracy_score(y_test, y_test_pred)]}, index = ["RF_W2V"])
df_models
plot_confusion_matrix(y_train, y_train_pred, title='Confusion Matrix - Train')
plot_confusion_matrix(y_test, y_test_pred, title='Confusion Matrix - Test')
# Classification Report - Train
print(classification_report(y_train, y_train_pred))
# Classification Report - Test
print(classification_report(y_test, y_test_pred))
params = {
'n_estimators': [30,40,50],
'max_depth': [1,2,3],
'min_samples_split': [4,5,6],
'criterion': ['gini','entropy'],
'min_samples_leaf': [5,7],
'ccp_alpha':[0.07],
'oob_score':[True],
'bootstrap':[True]
}
rf_w2v = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
random_search = RandomizedSearchCV(
n_iter = 30,
estimator = rf_w2v,
param_distributions=params,
cv=5,
random_state=42,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
best_model = random_search.best_estimator_
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)
df_models = pd.concat([df_models, pd.DataFrame({'train':[accuracy_score(y_train, y_train_pred)], 'validation': [best_model.oob_score_],'test': [accuracy_score(y_test, y_test_pred)]}, index = ["RF_W2V_Tuned"])])
df_models
plot_confusion_matrix(y_train, y_train_pred, title='Confusion Matrix - Train')
plot_confusion_matrix(y_test, y_test_pred, title='Confusion Matrix - Test')
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))
# independent and dependent variable initialization
X = df_w2v
y = data['Label']
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)
# target variable remapping
y_train = y_train.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
y_val = y_val.map({0:0, 1:1, -1:2})
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()
X_train.shape, X_val.shape, X_test.shape
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
svm_model = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
svm_model.fit(X_train_scaled, y_train)
y_train_pred = svm_model.predict(X_train_scaled)
y_val_pred = svm_model.predict(X_val_scaled)
y_test_pred = svm_model.predict(X_test_scaled)
df_models = pd.concat([df_models,pd.DataFrame({'train':accuracy_score(y_train, y_train_pred), 'validation':accuracy_score(y_val, y_val_pred), 'test': accuracy_score(y_test, y_test_pred)}, index = ['W2V_SVM'])])
df_models
plot_confusion_matrix(y_train, y_train_pred, title="Confusion Matrix - Train")
plot_confusion_matrix(y_val, y_val_pred, title="Confusion Matrix - Validation")
plot_confusion_matrix(y_test, y_test_pred, title="Confusion Matrix - Test")
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
svm_model = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
param_dist = {
# lower value will reduce overfitting
'C': np.logspace(-3, -1, 20),
# used in non-linear kernel, lower value will reduce overfitting
'gamma': np.logspace(-3, -1, 20),
'kernel': ['linear', 'rbf']
}
# Train the model with RandomizedSearchCV
random_search = RandomizedSearchCV(
svm_model,
param_distributions=param_dist,
n_iter=30,
cv=3,
scoring='accuracy',
random_state=42,
n_jobs=-1,
verbose=1
)
random_search.fit(X_train_scaled, y_train)
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
y_train_pred = best_model.predict(X_train_scaled)
y_val_pred = best_model.predict(X_val_scaled)
y_test_pred = best_model.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({'train':accuracy_score(y_train, y_train_pred), 'validation':accuracy_score(y_val, y_val_pred), 'test': accuracy_score(y_test, y_test_pred)}, index=['W2V_SVM_Tuned'])])
df_models
plot_confusion_matrix(y_train, y_train_pred, title="Confusion Matrix - Train")
plot_confusion_matrix(y_val, y_val_pred, title="Confusion Matrix - Validation")
plot_confusion_matrix(y_test, y_test_pred, title="Confusion Matrix - Test")
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
We will start with SVM model for the GloVe embedding
# data splitting
X = df_glove
y = data['Label']
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)
X_train.shape, X_val.shape, X_test.shape
# target variable remapping
y_train = y_train.map({0:0, 1:1, -1:2})
y_val = y_val.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
svm_glove = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
svm_glove.fit(X_train_scaled, y_train)
y_train_pred = svm_glove.predict(X_train_scaled)
y_val_pred = svm_glove.predict(X_val_scaled)
y_test_pred = svm_glove.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({
'train':[accuracy_score(y_train, y_train_pred)],
'validation':[accuracy_score(y_val, y_val_pred)],
'test': [accuracy_score(y_test, y_test_pred)]}, index = ['GloVe_SVM'])])
df_models
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_val, y_val_pred)
plot_confusion_matrix(y_test, y_test_pred)
svm_model = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
param_dist = {
# lower value will reduce overfitting
'C': np.logspace(-2, -1, 7),
# used in non-linear kernel, lower value will reduce overfitting
'gamma': np.logspace(-3, 1, 10),
'kernel': ['linear','rbf','poly', 'sigmoid']
}
# Train the model with RandomizedSearchCV
random_search = RandomizedSearchCV(
svm_model,
param_distributions=param_dist,
n_iter=30,
cv=3,
scoring='accuracy',
random_state=42,
n_jobs=-1,
verbose=1
)
random_search.fit(X_train_scaled, y_train)
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
best_model.fit(X_train_scaled, y_train)
y_train_pred = best_model.predict(X_train_scaled)
y_val_pred = best_model.predict(X_val_scaled)
y_test_pred = best_model.predict(X_test_scaled)
df_models= pd.concat([df_models,
pd.DataFrame({
'train':[accuracy_score(y_train, y_train_pred)],
'validation':[accuracy_score(y_val, y_val_pred)],
'test': [accuracy_score(y_test, y_test_pred)]}, index = ['GloVe_SVM_Tuned'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_val, y_val_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
As our dataset is small we will set oob_score true for validation and only use train and test split.
# data splitting
X = df_glove
y = data['Label']
# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape
y_train = y_train.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
y_train.value_counts(), y_test.value_counts()
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
# scale features: fit statistics on the training split only, then apply the
# same transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# BUG FIX: the test set was previously re-fit (fit_transform), which leaks
# test-set statistics and scales train/test inconsistently; every other
# split in this notebook correctly uses transform() here.
X_test_scaled = scaler.transform(X_test)
rf_glove = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
rf_glove.fit(X_train_scaled, y_train)
y_train_pred = rf_glove.predict(X_train_scaled)
y_test_pred = rf_glove.predict(X_test_scaled)
df_models = pd.concat([ df_models,
pd.DataFrame({'train':[accuracy_score(y_train, y_train_pred)], 'validation':[rf_glove.oob_score_], 'test': [accuracy_score(y_test, y_test_pred)]}, index = ['GloVe_RF'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))
params = {
'n_estimators': [30,40,50],
'max_depth': [1,2,3],
'min_samples_split': [3,4,5],
'criterion': ['gini','entropy'],
'min_samples_leaf': [17,19,20],
'ccp_alpha':[0.08],
'oob_score':[True],
'bootstrap':[True]
}
rf_glove = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
random_search = RandomizedSearchCV(
n_iter = 30,
estimator = rf_glove,
param_distributions=params,
cv=5,
random_state=42,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
best_model = random_search.best_estimator_
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({
'train':[accuracy_score(y_train, y_train_pred)],
'validation':[best_model.oob_score_],
'test': [accuracy_score(y_test, y_test_pred)]}, index = ['GloVe_RF_Tuned'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))
We set oob_score to True and use it as the estimated validation score.
# data split
X = np.load('/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/sentence_embeddings.npy')
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape
# target variable remapping
y_train = y_train.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
rf_sent = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
rf_sent.fit(X_train_scaled, y_train)
y_train_pred = rf_sent.predict(X_train_scaled)
y_test_pred = rf_sent.predict(X_test_scaled)
df_models = pd.concat([ df_models,
pd.DataFrame({'train':[accuracy_score(y_train, y_train_pred)], 'validation':[rf_sent.oob_score_], 'test': [accuracy_score(y_test, y_test_pred)]}, index = ['Sent_RF'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))
params = {
'n_estimators': [20,30,40],
'max_depth': [2,3,4],
'min_samples_split': [17,18,19],
'min_samples_leaf': [20,21,22],
'oob_score':[True],
'bootstrap':[True],
'criterion': ['gini','entropy'],
'ccp_alpha':[0.09,0.15,0.2]
}
rf_sent = RandomForestClassifier(random_state=42, class_weight=class_weight, n_jobs=-1, oob_score=True, bootstrap=True)
random_search = RandomizedSearchCV(
n_iter = 30,
estimator = rf_sent,
param_distributions=params,
cv=5,
random_state=42,
scoring='accuracy'
)
random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
best_model = random_search.best_estimator_
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({'train':[accuracy_score(y_train, y_train_pred)],
'validation':[best_model.oob_score_],
'test': [accuracy_score(y_test, y_test_pred)]}, index = ['Sent_RF_Tuned'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_test, y_test_pred)
# data split
X = np.load('/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/sentence_embeddings.npy')
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
X_train.shape, X_val.shape, X_test.shape
y_train = y_train.map({0:0, 1:1, -1:2})
y_val = y_val.map({0:0, 1:1, -1:2})
y_test = y_test.map({0:0, 1:1, -1:2})
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()
class_weight = compute_class_weight(class_weight='balanced', y = y_train, classes=np.unique(y_train))
class_weight = dict(zip(np.unique(y_train), class_weight))
class_weight
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
svm_model = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
svm_model.fit(X_train_scaled, y_train)
y_train_pred = svm_model.predict(X_train_scaled)
y_val_pred = svm_model.predict(X_val_scaled)
y_test_pred = svm_model.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({
'train': accuracy_score(y_train, y_train_pred),
'validation':accuracy_score(y_val, y_val_pred),
'test': accuracy_score(y_test, y_test_pred)}, index = ['Sent_SVM'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_val, y_val_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
svm_model = SVC(class_weight=class_weight,random_state=42, decision_function_shape='ovo')
param_dist = {
'C': np.logspace(-4, -2, 10), # Regularization parameter
'gamma': np.logspace(-3, 1, 10), # Kernel coefficient
'kernel': ['linear', 'rbf'] # Different kernel functions
}
# Train the model with RandomizedSearchCV
random_search = RandomizedSearchCV(
svm_model,
param_distributions=param_dist,
n_iter=30,
cv=3,
scoring='accuracy',
random_state=42,
n_jobs=-1,
verbose=1
)
random_search.fit(X_train_scaled, y_train)
svm_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)
svm_model.fit(X_train_scaled, y_train)
y_train_pred = svm_model.predict(X_train_scaled)
y_val_pred = svm_model.predict(X_val_scaled)
y_test_pred = svm_model.predict(X_test_scaled)
df_models = pd.concat([df_models,
pd.DataFrame({
'train':[accuracy_score(y_train, y_train_pred)],
'validation':[accuracy_score(y_val, y_val_pred)],
'test':[accuracy_score(y_test, y_test_pred)]}, index = ['Sent_SVM_Tuned'])])
df_models
plot_confusion_matrix(y_train, y_train_pred)
plot_confusion_matrix(y_val, y_val_pred)
plot_confusion_matrix(y_test, y_test_pred)
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))
print(classification_report(y_test, y_test_pred))
df_models
The tuned Random Forest model (RF_W2V_Tuned) using Word2Vec embeddings achieves the highest accuracy, 44.7%, on the test dataset. The tuned Support Vector Machine (SVM) model using Sentence Transformer embeddings (Sent_SVM_Tuned) also achieves 44.7% accuracy on the test dataset. Sent_SVM_Tuned scores around 72% accuracy on the training dataset and 61% on the validation dataset. RF_W2V_Tuned, however, achieves slightly lower training accuracy (around 61%) and only 32% on the validation dataset. Overall, Sent_SVM_Tuned has the best performance across the training, validation and test datasets. This model still overfits, but it generalizes better than any other model we tried; the overfitting could be due to the small dataset. We therefore select the tuned SVM model (Sent_SVM_Tuned) using Sentence Transformer embeddings.
y_test_pred = svm_model.predict(X_test_scaled)
accuracy_score(y_test, y_test_pred)
df_models.iloc[11:]
Important Note: It is recommended to run this section of the project independently from the previous sections in order to avoid runtime crashes due to RAM overload.
# GPU llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 --force-reinstall --upgrade --no-cache-dir
# import libraries for llm models
import torch
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Importing the library for data manipulation
import pandas as pd
import numpy as np
# For progress bar related functionalities
from tqdm import tqdm
tqdm.pandas()
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Hugging Face repo and file name of a quantized (q5_1) Llama-2 13B chat
# model in GGML format, served locally via llama-cpp-python.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format
# Download the model to the /content folder in Colab
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
print(f"Model downloaded to: {model_path}")
# Load the model with T4-optimized settings
llama_model = Llama(
    model_path=model_path,
    n_threads=2,       # CPU threads for the non-offloaded work
    n_gpu_layers=41,   # layers offloaded to the GPU -- sized for a Colab T4 per the comment above
    n_batch=512,       # prompt tokens processed per batch
    n_ctx=4096         # context window size in tokens
)
# Parse the release dates so they can drive time-based grouping.
data["Date"] = pd.to_datetime(data['Date'])
# Collapse the daily rows into one row per calendar week, concatenating all
# news items of the week with a ' || ' separator.
weekly_grouped = (
    data.groupby(pd.Grouper(key='Date', freq='W'))
    .agg({'News': lambda articles: ' || '.join(articles)})
    .reset_index()
)
print(weekly_grouped.shape)
# Show full cell contents instead of truncating long news strings.
pd.set_option('display.max_colwidth', None)
weekly_grouped.head(1)
# Work on a copy so the grouped frame stays untouched.
weekly_data = weekly_grouped.copy()

# Count Llama tokens per weekly news blob to check it fits the model context.
def _llama_token_len(text):
    return len(llama_model.tokenize(text.encode("utf-8"), add_bos=True))

token_count = weekly_data['News'].apply(_llama_token_len)
print("Max token count:", max(token_count))
Note:
The model is expected to summarize the news from the week by identifying the top three positive and negative events that are most likely to impact the price of the stock.
As an output, the model is expected to return a JSON containing two keys, one for Positive Events and one for Negative Events.
import json
# defining a function to parse the JSON output from the model
def extract_json_data(json_str):
    """Extract and parse the first '{' ... last '}' span of *json_str*.

    Returns the parsed dictionary, or an empty dict when no JSON object is
    present or the extracted span is not valid JSON (a diagnostic message is
    printed in both failure cases).
    """
    opening = json_str.find('{')
    closing = json_str.rfind('}')
    # Guard clause: nothing that even looks like a JSON object.
    if opening == -1 or closing == -1:
        print(f"Warning: JSON object not found in response: {json_str}")
        return {}
    candidate = json_str[opening:closing + 1]
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print(f"Input string: {json_str}")
        return {}
#Defining the response function
def response_llm(prompt):
    """Run *prompt* through the loaded Llama model and return the generated text."""
    completion = llama_model(
        prompt,
        max_tokens=2048,
        temperature=0.1,    # low temperature -> near-deterministic output
        top_p=0.95,
        top_k=50,
        repeat_penalty=1.2,
        stop=['INST'],      # halt generation when the instruction tag reappears
        echo=False,         # do not repeat the prompt in the output
    )
    return completion["choices"][0]["text"]
def generate_prompt(news, instructions = ""):
    """Build a Llama-2 chat prompt ([INST]/<<SYS>> format) for a news item.

    Parameters:
        news (str): the news text (or JSON payload) to be analysed.
        instructions (str): system instructions for the model; when empty,
            a default sentiment-classification task is used that asks for a
            JSON object containing the news, its sentiment, and a score
            between -10 and +10.

    Returns:
        str: the fully formatted prompt string.
    """
    if instructions == "":
        # Fixes vs. the earlier draft: "postive" -> "positive", and the
        # trailing comma after "sentiment_score" is removed -- the example
        # JSON shown to the model was invalid, encouraging unparseable output.
        instructions = f"""
You are a news analyst specialized in analysing stock market news.
You will:
- classify the sentiment of the news article either positive or negative. Classify the news positive if it has positive impact on the stock price. Classify the news negative if it has negative impact on the stock price.
- Rate the news from -10 (most negative) to +10 (most positive)
- format the result in JSON format as below:
{{
"news" : "{news}",
"sentiment":"predicted sentiment of the news",
"sentiment_score": "number between -10 and 10"
}}
"""
    prompt = """
{news}
[INST]<<SYS>>
{instructions}
<</SYS>>[/INST]
""".format(news=news, instructions=instructions)
    return prompt
# Split the first week's concatenated news back into individual articles.
weekly_news_articles = weekly_data["News"][0].split("||")
# Sanity-check the default prompt on the first article.
test_prompt_1 = generate_prompt(weekly_news_articles[0])
print(test_prompt_1)
# Run the first three articles through the LLM and inspect prompt + parsed reply.
for i in range(3):
    prompt = generate_prompt(weekly_news_articles[i])
    print(prompt)
    print(extract_json_data(response_llm(prompt)))
def generate_sent_analysis_prompt(news):
    """Build a sentiment-only prompt for *news* (no numeric score requested).

    Wraps generate_prompt() with instructions asking the model to return a
    JSON object holding only the news text and its predicted sentiment.

    Parameters:
        news (str): the news article text to classify.

    Returns:
        str: the formatted prompt string.
    """
    # Fix vs. the earlier draft: "postive" -> "positive" in the instruction text.
    instructions = f"""
You are a news analyst specialized in analysing stock market news.
You will:
- classify the sentiment of the news article either positive or negative. Classify the news positive if it has positive impact on the stock price. Classify the news negative if it has negative impact on the stock price.
- format the result in JSON format as below:
{{
"news" : "{news}",
"sentiment":"predicted sentiment of the news"
}}
"""
    return generate_prompt(news, instructions)
# Spot-check the sentiment-only prompt and the parsed LLM reply on the
# first three articles of the first week.
for i in range(3):
    sent_prompt = generate_sent_analysis_prompt(weekly_news_articles[i])
    print(sent_prompt)
    print(extract_json_data(response_llm(sent_prompt)))
# Per-week list of individual articles. The weekly blobs were joined with
# ' || ' but split here on '||', so each article is stripped to remove the
# stray leading/trailing spaces that would otherwise be fed to the model.
weekly_news_articles = [
    [article.strip() for article in weekly_data["News"][i].split("||")]
    for i in range(weekly_data.shape[0])
]
# Iterate through each week's articles and classify their sentiment with the LLM.
news_dict = {}
for i in range(weekly_data.shape[0]):
    weekly_date = weekly_data["Date"][i].strftime("%Y-%m-%d")
    if weekly_date not in news_dict:
        news_dict[weekly_date] = {"positive_events": [], "negative_events": []}
    print(f"Processing {len(weekly_news_articles[i])} news for week: {weekly_date}")
    for index, item in enumerate(weekly_news_articles[i]):
        print(f"Processing news index: {index}")
        prompt = generate_sent_analysis_prompt(item)
        json_response = extract_json_data(response_llm(prompt))
        # Guard against a missing OR non-string sentiment (the LLM may return
        # null or malformed JSON) instead of crashing mid-run on .lower().
        sentiment = json_response.get("sentiment")
        if not isinstance(sentiment, str):
            print("Warning: llama 2 failed to predict sentiment for this news:", item)
            continue
        if sentiment.lower() == 'positive':
            news_dict[weekly_date]["positive_events"].append(json_response)
        elif sentiment.lower() == 'negative':
            news_dict[weekly_date]["negative_events"].append(json_response)
# Persist the per-week sentiment dictionary so later cells can reload it
# instead of re-running the (slow) LLM classification from scratch.
file_path = "/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/llama_news_sentiment.json"
try:
    with open(file_path, "w") as out_file:
        json.dump(news_dict, out_file, indent=4)
    print(f"Dictionary saved at: {file_path}")
except Exception as e:
    print(f"Error saving JSON file: {e}")
# print output was used for debugging and deleted before saving the file as html
# System instructions asking the LLM to pick the week's top 3 positive
# events out of a JSON list and reply in strict JSON only.
top_pos_news_instructions = """
You are a news analyst specialized in analysing stock market news.
You will:
- identify top 3 positive events from the list of positive news given in JSON format.
- output only in the following JSON format:
{
"date": "date of the news",
"positive_events" : [
{
"news": "top most positive news event"
},
{
"news": "2nd most positive news event"
},
{
"news": "3rd most positive news event"
}
]
}
- not add extra text before or after the JSON.
- not number the events (e.g., `1., 2., 3.`).
"""
# Mirror-image instructions for the negative side: top 3 negative events,
# same strict-JSON output contract.
top_neg_news_instructions = """
You are a news analyst specialized in analysing stock market news.
You will:
- identify top 3 negative events from the list of negative news given in JSON format.
- output only in the following JSON format:
{
"date": "date of the news",
"negative_events" : [
{
"news": "top most negative news event"
},
{
"news": "2nd most negative news event"
},
{
"news": "3rd most negative news event"
}
]
}
- not add extra text before or after the JSON.
- not number the events (e.g., `1., 2., 3.`).
"""
# Reload the saved per-week sentiment dictionary from disk.
file_path = "/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/llama_news_sentiment.json"
try:
    with open(file_path, "r") as saved:
        news_dict = json.load(saved)
except Exception as e:
    print(f"Error loading JSON file: {e}")
# Smoke-test the "top 3 negative" prompt on the first week's data only.
first_week, first_events = next(iter(news_dict.items()))
json_string = json.dumps(
    {
        "date": first_week,
        "negative_events": [event["news"] for event in first_events["negative_events"]],
    },
    indent=4,
)
prompt = generate_prompt(json_string, top_neg_news_instructions)
extract_json_data(response_llm(prompt))
def _top_events_for_week(week, event_texts, events_key, instructions):
    """Ask the LLM for the top-3 events of one polarity for one week.

    Builds the JSON payload, prompts the model, and returns the list under
    *events_key* in the parsed reply -- or [] when the reply is missing or
    malformed, so one bad week cannot break the whole loop.
    """
    payload = json.dumps({"date": week, events_key: event_texts}, indent=4)
    reply = extract_json_data(response_llm(generate_prompt(payload, instructions)))
    return reply.get(events_key, [])

# Iterate through each week's data and build the JSON of the top 3 positive
# and negative news events. The positive/negative branches previously
# duplicated the same payload/prompt/parse sequence; it now lives in the
# helper above.
weekly_top_news = {}
for key, value in news_dict.items():
    print(f"Processing negative news for week: {key}")
    if key not in weekly_top_news:
        weekly_top_news[key] = {}
    weekly_top_news[key]["negative_events"] = _top_events_for_week(
        key,
        [event["news"] for event in value["negative_events"]],
        "negative_events",
        top_neg_news_instructions,
    )
    print(f"Processing positive news for week: {key}")
    weekly_top_news[key]["positive_events"] = _top_events_for_week(
        key,
        [event["news"] for event in value["positive_events"]],
        "positive_events",
        top_pos_news_instructions,
    )
# Persist the weekly top-3 news so this expensive LLM step can be skipped on re-runs.
file_path = f"/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/top_news.json"
try:
    with open(file_path, "w") as f:
        json.dump(weekly_top_news, f, indent=4)
    print(f"Dictionary saved at: {file_path}")
except Exception as e:
    print(f"Error saving JSON file: {e}")
# print output was for debugging purpose, and was deleted before converting to html
top_news_json = json.dumps(weekly_top_news, indent=4)
print(top_news_json)
# Token count of the full top-news JSON -- sanity check against the model's
# 4096-token context window (n_ctx above).
len(llama_model.tokenize(top_news_json.encode("utf-8"), add_bos=True))
# Reload the saved weekly top-news dictionary from disk.
file_path = "/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/top_news.json"
try:
    with open(file_path, "r") as f:
        weekly_top_news = json.load(f)
except Exception as e:
    # NOTE: on failure weekly_top_news keeps its previous value (or stays undefined).
    print(f"Error loading JSON file: {e}")
def get_news_summary_instructions(date):
    """Return system instructions asking the LLM to summarise one week's news.

    Parameters:
        date (str): the week-ending date; embedded in the required output
            JSON so the reply can be keyed back to its week.

    Returns:
        str: instruction text demanding a strict-JSON reply containing a
        "summary" and a "sentiment" (positive / negative / neutral).
    """
    # Fix vs. the earlier draft: "postive" -> "positive" in category 1.
    return f"""
You are a news analyst specialized in analysing stock market news.
You will:
- summarise the news of a week based on the news given to you in JSON format.
- classify the overall sentiment for the week in following categories only:
1. positive - if the overall sentiment of the week has positive impact on stock price.
2. negative - if the overall sentiment of the week has negative impact on stock price.
3. neutral - if the overall sentiment of the week has no impact or mixed impact on stock price.
- output in Strict JSON Format (Follow This Exactly):
{{
"{date}":{{
"summary": "summary of weekly news",
"sentiment": "predicted sentiment"
}}
}}
- ensure the output is valid JSON.
- not add extra text before or after the JSON.
- not number the events (e.g., `1., 2., 3.`).
"""
# checking the prompt with the first week data
key, value = next(iter(weekly_top_news.items()))
first_week_news = json.dumps({
    key: value
})
prompt = generate_prompt(first_week_news, get_news_summary_instructions(key))
print(prompt)
# llm output for the first week data
llm_response = response_llm(prompt)
print(llm_response)
# json output for the first week data only
json_response = extract_json_data(llm_response)
print(json_response)
# Generate a weekly summary plus an overall sentiment for every week via the
# LLM, storing both back onto the week's entry.
for key, value in weekly_top_news.items():
    print(f"Processing news for week: {key}")
    week_payload = json.dumps({key: value}, indent=4)
    prompt = generate_prompt(week_payload, get_news_summary_instructions(key))
    llm_response = extract_json_data(response_llm(prompt))
    # The reply is keyed by the week date; fall back to empty strings when
    # the model's JSON is missing or malformed.
    week_result = llm_response.get(key, {})
    weekly_top_news[key]["weekly_summary"] = week_result.get("summary", "")
    weekly_top_news[key]["weekly_sentiment"] = week_result.get("sentiment", "")
# Persist the weekly summaries/sentiments so the reporting section below can
# run independently without re-invoking the LLM.
file_path = f"/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/summary.json"
try:
    with open(file_path, "w") as f:
        json.dump(weekly_top_news, f, indent=4)
    print(f"Dictionary saved at: {file_path}")
except Exception as e:
    print(f"Error saving JSON file: {e}")
# print output was for debugging purpose only and was deleted before converting to html
# importing the libraries so we can run this section independently
import pandas as pd
import json
import numpy as np
# Reload the saved summaries; each week maps to positive_events,
# negative_events, weekly_summary and weekly_sentiment.
file_path = "/content/drive/MyDrive/AI_ML_PGP/Projects/StockMarketSentimentAnalysisAndSummarization/summary.json"
try:
    with open(file_path, "r") as f:
        summary = json.load(f)
except Exception as e:
    # NOTE: on failure `summary` keeps its previous value (or stays undefined).
    print(f"Error loading JSON file: {e}")
# Flatten the weekly summary dictionary into a tabular DataFrame: one row
# per week with its top news, summary and sentiment.
summary_data = []
for week, details in summary.items():
    summary_data.append([
        week,
        " || ".join(event["news"] for event in details["positive_events"]),
        " || ".join(event["news"] for event in details["negative_events"]),
        details["weekly_summary"],
        details["weekly_sentiment"],
    ])
summary_df = pd.DataFrame(
    summary_data,
    columns=["Date", "Top 3 Positive News", "Top 3 Negative News", "Weekly Summary", "Weekly Sentiment"],
)
summary_df
The LLM classifies the overall weekly sentiment into one of three categories: positive, negative and neutral.

summary_df
# display with max column width
pd.set_option('display.max_colwidth', None)
# Show the first five weekly summary rows.
summary_df.head(5)
Power Ahead